- maximize insight into a data set
- detection of mistakes
- detect outliers and anomalies
- test underlying assumptions
- determining relationships among the explanatory variables
import pandas as pd import numpy as np url='https://raw.githubusercontent.com/urosgodnov/datasets/master/winequality-white.csv' data = pd.read_csv(url, sep=";") data.info()
## <class 'pandas.core.frame.DataFrame'> ## RangeIndex: 4898 entries, 0 to 4897 ## Data columns (total 12 columns): ## # Column Non-Null Count Dtype ## --- ------ -------------- ----- ## 0 fixed acidity 4898 non-null float64 ## 1 volatile acidity 4898 non-null float64 ## 2 citric acid 4898 non-null float64 ## 3 residual sugar 4898 non-null float64 ## 4 chlorides 4898 non-null float64 ## 5 free sulfur dioxide 4898 non-null float64 ## 6 total sulfur dioxide 4898 non-null float64 ## 7 density 4898 non-null float64 ## 8 pH 4898 non-null float64 ## 9 sulphates 4898 non-null float64 ## 10 alcohol 4898 non-null float64 ## 11 quality 4898 non-null int64 ## dtypes: float64(11), int64(1) ## memory usage: 459.3 KB
data.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
data.quality.unique()
## array([6, 5, 7, 8, 4, 3, 9], dtype=int64)
data.quality.value_counts()
## 6 2198 ## 5 1457 ## 7 880 ## 8 175 ## 4 163 ## 3 20 ## 9 5 ## Name: quality, dtype: int64
import seaborn as sns import matplotlib.pyplot as plt sns.heatmap(data.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')
l = data.columns.values
number_of_columns=12
number_of_rows = len(l)-1/number_of_columns
plt.figure(figsize=(number_of_columns,4*number_of_rows))
for i in range(0,len(l)):
plt.subplot(number_of_rows + 1,number_of_columns,i+1)
sns.set_style('whitegrid')
sns.boxplot(data[l[i]],color='green',orient='v')
plt.tight_layout()
plt.figure(figsize=(2*number_of_columns,60))
for i in range(0,len(l)):
plt.subplot(12 + 1,number_of_columns,i+1)
sns.distplot(data[l[i]],kde=True)
k = 12 #number of variables for heatmap cols = data.corr().nlargest(k, 'quality')['quality'].index cm = data[cols].corr() plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis')
#full report from pandas_profiling import ProfileReport prof = ProfileReport(data) prof.to_file(output_file='Reportoutput.html') #simple report prof_minimal=ProfileReport(data, minimal=True) prof_minimal.to_file(output_file="Reportoutput_min.html")
url='https://raw.githubusercontent.com/urosgodnov/datasets/master/mtcars.csv' data = pd.read_csv(url)
url<-'https://raw.githubusercontent.com/urosgodnov/datasets/master/winequality-white.csv' data <- read.csv2(url, dec=".") str(data)
## 'data.frame': 4898 obs. of 12 variables: ## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ... ## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ... ## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ... ## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ... ## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ... ## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ... ## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ... ## $ density : num 1.001 0.994 0.995 0.996 0.996 ... ## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ... ## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ... ## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ... ## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar ## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600 ## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700 ## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200 ## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391 ## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900 ## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800 ## chlorides free.sulfur.dioxide total.sulfur.dioxide density ## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871 ## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917 ## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937 ## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940 ## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961 ## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390 ## pH sulphates alcohol quality ## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000 ## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000 ## Median :3.180 Median :0.4700 Median :10.40 Median :6.000 ## Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878 ## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000 ## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
library(tidyverse) data%>%select(quality)%>%distinct()
## quality ## 1 6 ## 2 5 ## 3 7 ## 4 8 ## 5 4 ## 6 3 ## 7 9
data%>%select(quality)%>%count(quality, sort = TRUE)
## quality n ## 1 6 2198 ## 2 5 1457 ## 3 7 880 ## 4 8 175 ## 5 4 163 ## 6 3 20 ## 7 9 5
library(naniar) gg_miss_upset(airquality)
df<-data%>%pivot_longer(cols=fixed.acidity:quality,names_to="variable", values_to = "value")
ggplot(data = df, aes(x=variable, y=value)) +
scale_y_log10() +
geom_point() +
geom_boxplot(outlier.colour='blue')+
coord_flip()
ggplot(df,aes(value)) +
facet_wrap(~ variable, scales = "free") +
geom_histogram()
library(PerformanceAnalytics) chart.Correlation(data)
#inline statistics - better than summary
library(skimr)
skim(data)
#full report
library("dataMaid")
makeDataReport(data,
render = TRUE,
output = "html",
file = "FullReport.html",
replace = TRUE,
reportTitle ="Full report on dataset")
#simple report
library("summarytools")
view(dfSummary(data))
#interactive shiny report
library("explore")
explore(data)
#detailed report
library(DataExplorer)
create_report(data, output_file = "report.html")